home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
Skunkware 5
/
Skunkware 5.iso
/
src
/
X11
/
wais
/
waisgate
/
sersrch.c
< prev
next >
Wrap
C/C++ Source or Header
|
1995-05-09
|
16KB
|
599 lines
/* WIDE AREA INFORMATION SERVER SOFTWARE
No guarantees or restrictions. See the readme file for the full standard
disclaimer.
Brewster@think.com
*/
/* implements the search part of irext.h
(search_word and finished_search_word)
-brewster
Split from irsearch.c
5/31/91 Added scale_scores. Fixed document_score_array to long.
7/8/91 Removed scale_scores, handled in search_word with doc_id > 0.
2/4/92 Made document_score_array a double.
- Jonny G
* $Log: sersrch.c,v $
* Revision 1.24 92/04/28 16:56:54 morris
* added boolean to serial engine
*
* Revision 1.23 92/03/15 10:15:18 jonathan
* Added Simon Spero's ASSIGN replacement for read_bytes.
*
* Revision 1.22 92/03/05 07:09:54 shen
* add two more dummy arguments to call to init_search_engine
*
* Revision 1.21 92/02/12 17:29:52 jonathan
* Conditionalized inclusion of object code.
*
* Revision 1.20 92/02/12 13:40:06 jonathan
* Added "$Log" so RCS will put the log message in the header
*
*/
#include "cdialect.h"
#include "irfiles.h"
#include "irsearch.h"
#include "irext.h"
#include "byte_order.h"
#include <string.h>
#ifdef BOOL
#include "obj.h"
#include "irparse.h"
object* currentQuery = NULL; /* kludge until irext goes away */
#endif /* def BOOL */
/* weighting for relevant document terms -
this may become a parameter to the query.
search_word() multiplies a posting's weight by this factor whenever
its doc_id argument is non-zero.
*/
#define RF_WEIGHTING 0.1
/* ==================================
* === Initialization Functions ===
* ==================================*/
/* init_search_engine: one-time set-up of the search engine.
 * Only the first call does any work, and only when BOOL is defined
 * (it then runs initObj() and initBool()).  None of the arguments is
 * looked at.  Always returns 0.
 */
long init_search_engine(file, initialize, for_search, cm_mem_percent,
			text_size, grow_percent)
char* file;
boolean initialize;
boolean for_search;
long cm_mem_percent; /* unused */
long text_size; /* unused */
long grow_percent; /* unused */
{
  static boolean already_done = false;

  if (already_done != false)
    return(0);		/* set-up already happened on an earlier call */

#ifdef BOOL
  initObj();
  initBool();
#endif
  already_done = true;
  return(0);
}
/* finished_search_engine: shut-down hook of the search engine.
 * There is nothing to release here; always returns 0.
 */
long finished_search_engine()
{
  return 0;
}
/*
 * ext_open_database: see irext.h
 *
 * Opens the inverted index file of db (the .inv file) and stores the
 * stream in db->index_stream.  The open mode depends on the flags:
 *   initialize  -> "w+b"  (make a new one)
 *   for_search  -> "rb"   (just search)
 *   neither     -> "r+b"  (write to an existing db)
 * Returns 0 on success.  On failure the error is logged, db is handed to
 * disposeDatabase() and 1 is returned.
 */
long ext_open_database (db, initialize, for_search)
database *db;
boolean initialize;
boolean for_search;
{
  char file[MAX_FILE_NAME_LEN];
  char *mode;

  if(initialize)
    mode = "w+b";
  else if(for_search)
    mode = "rb";
  else
    mode = "r+b";

  db->index_stream = s_fopen(index_filename(file, db), mode);
  if (db->index_stream != NULL)
    return(0);

  waislog(WLOG_HIGH, WLOG_ERROR,"2can't open the inverted index file %s\n",
	  file);
  disposeDatabase(db);
  return(1);
}
/*
 * ext_close_database: see irext.h
 *
 * Nothing is done with db here; the function always returns 0.
 */
long ext_close_database (db)
database *db;
{
  return 0;
}
/* database_file: returns database_name itself, unchanged (same pointer,
 * no copy is made).
 */
char *database_file(database_name)
char *database_name;
{
  return database_name;
}
/*===========================*
*=== Setting Parameters ===*
*===========================*/
/* Number of best hits to collect; set through SET_MAX_RETRIEVED_MASK. */
long max_hit_retrieved = 0;
/* Source list installed through SET_SELECT_SOURCE. */
char **srcs = NULL;

/* set_query_parameter: installs one query parameter, chosen by mask.
 *   SET_MAX_RETRIEVED_MASK  copies parameters->max_hit_retrieved.
 *   SET_SELECT_SOURCE       releases the first element of the previous
 *                           srcs list and the list itself (if any), then
 *                           keeps the parameters->srcs pointer.
 * Returns 0 on success and -1 for any other mask.
 */
long set_query_parameter (mask, parameters)
long mask;
query_parameter_type * parameters;
{
  if (mask == SET_MAX_RETRIEVED_MASK) {
    max_hit_retrieved = parameters->max_hit_retrieved;
    return(0);
  }

  if (mask == SET_SELECT_SOURCE) {
    if (srcs != NULL) {
      if (srcs[0] != NULL) {
	s_free(srcs[0]);
      }
      s_free(srcs);
    }
    srcs = parameters->srcs;
    return(0);
  }

  return(-1);	/* unknown mask */
}
/*==============================*
*=== Document Score Array ===*
*==============================*/
/* One score per document id, accumulated by search_word(). */
double *document_score_array = NULL;
/* Number of doubles currently allocated in document_score_array. */
long document_score_array_len = 0;

/* make_document_score_array ensures that document_score_array holds at
 * least `length' entries.  When it is too short the old array is freed
 * and a new one is allocated; the old contents are not copied over.
 */
static void make_document_score_array _AP((long length));

static void make_document_score_array(length)
long length;
{
  if(document_score_array_len >= length)
    return;			/* already long enough */

  /* too short: drop the old array (if any) and allocate a new one */
  if(document_score_array != 0){
    s_free(document_score_array);
  }
  document_score_array_len = length;
  document_score_array =
    (double*)s_malloc((size_t)(length * sizeof(double)));
}
/* destroy_document_score_array: releases document_score_array and marks
 * it as empty.
 */
static void destroy_document_score_array _AP((void));

static void destroy_document_score_array()
{
  s_free(document_score_array);
  /* Reset the pointer together with the length.
   * make_document_score_array() frees any non-null pointer before
   * allocating, so a stale pointer left here would be freed twice.
   * (Harmless if s_free already clears its argument.) */
  document_score_array = NULL;
  document_score_array_len = 0;
}
/* clear_document_score_array: sets every allocated score to zero bytes.
 * Side effects the document_score_array.  Does nothing when the array
 * has not been allocated yet, e.g. when finished_best_hit() runs before
 * any search_word() call.
 */
void clear_document_score_array()
{
  /* memset() must not be handed a null pointer, even with length 0 */
  if(document_score_array == NULL || document_score_array_len <= 0)
    return;
  memset(document_score_array, 0,
	 (size_t)document_score_array_len * sizeof(double));
}
/* for debugging purposes
 *
 * print_document_score_array: prints document_score_array[start..stop]
 * (both ends included) to stdout, one entry per line.
 * Assumes stop < db->doc_table_allocated_entries; no bounds are checked.
 */
void print_document_score_array(start,stop)
unsigned long start;
unsigned long stop;
{
  unsigned long i;

  for(i = start; i <= stop; i++){
    /* the index is an unsigned long, so it needs %lu (was %d) */
    printf("entry number %lu: %f \n",
	   i, document_score_array[i]);
  }
}
/*=========================*
*=== Best Hits Array ===*
*=========================*/
/* Table of the best hits, (re)allocated by make_best_hits_array() and
   filled in by sort_best_hits(). */
hit *best_hits_array = NULL;
/* Number of hit entries currently allocated in best_hits_array. */
long best_hits_array_len = 0;
/* Index of the entry the next best_hit() call hands out; reset to 0 by
   search_word() and finished_best_hit(). */
long current_best_hit = 0;
/* see irext.h for doc
 *
 * init_best_hit: when BOOL is defined and a boolean query is current,
 * InitBestHit is sent to that query object together with db.
 * Otherwise nothing happens.  Always returns 0.
 */
long init_best_hit (db)
database *db;
{
#ifdef BOOL
  if (NULL != currentQuery) {
    send(currentQuery,InitBestHit,db);
  }
#endif /* def BOOL */
  return 0;
}
/* make_best_hits_array ensures that best_hits_array holds at least
 * `length' entries.  When it is too short the old array is freed and a
 * new one is allocated; the old contents are not copied over.
 */
static void make_best_hits_array _AP((long length));

static void make_best_hits_array(length)
long length;
{
  if(best_hits_array_len >= length)
    return;			/* already long enough */

  /* too short: drop the old array (if any) and allocate a new one */
  if(best_hits_array != 0){
    s_free(best_hits_array);
  }
  best_hits_array_len = length;
  best_hits_array = (hit*)s_malloc((size_t)(length * sizeof(hit)));
}
/* destroy_best_hits_array: releases best_hits_array and marks it as
 * empty.
 */
static void destroy_best_hits_array _AP((void));

static void destroy_best_hits_array()
{
  s_free(best_hits_array);
  /* Reset the pointer together with the length.
   * make_best_hits_array() frees any non-null pointer before allocating,
   * so a stale pointer left here would be freed twice.
   * (Harmless if s_free already clears its argument.) */
  best_hits_array = NULL;
  best_hits_array_len = 0;
}
/* clear_best_hits_array: sets every allocated hit entry to zero bytes.
 * Side effects the best_hits_array.  Does nothing when the array has not
 * been allocated yet, e.g. when finished_best_hit() runs before any
 * finished_search_word() call.
 */
void clear_best_hits_array()
{
  /* memset() must not be handed a null pointer, even with length 0 */
  if(best_hits_array == NULL || best_hits_array_len <= 0)
    return;
  memset((char*)best_hits_array, 0,
	 (size_t)best_hits_array_len * sizeof(hit));
}
/* for debugging purposes */
void print_best_hits()
{
long i;
for( i = 0; i < best_hits_array_len; i++){
if (best_hits_array[i].weight != 0)
{ printf("Best hit %ld: weight %ld, doc_id %ld, headline %s, filename %s, lines %ld\n",
i, best_hits_array[i].weight,
best_hits_array[i].document_id,
best_hits_array[i].headline,
best_hits_array[i].filename,
best_hits_array[i].number_of_lines);
}
}
}
/* sort_best_hits: fills best_hits_array with the max_hit_retrieved
 * highest scoring documents of document_score_array, best first.
 *
 * Pass 1 inserts (doc id, weight) pairs in descending weight order.
 * Pass 2 completes each hit from the document, filename and headline
 * tables.  Hits whose document table entry cannot be read are dropped
 * and the later hits move up to close the gap.
 * Finally every slot behind the last completed hit gets weight 0, which
 * is the end marker best_hit() looks for.
 *
 * Returns nothing; side effects best_hits_array.
 * The caller must have made best_hits_array at least max_hit_retrieved
 * long and document_score_array at least
 * db->doc_table_allocated_entries long (finished_search_word() does).
 */
void sort_best_hits(db)
database * db;
{
  long i, doc;
  double worst_weight_to_make_it = 0.0;
  document_table_entry doc_entry;
  long best_hit_number = 0;	/* next slot to receive a completed hit */

  /* snuff the scores */
  for(i = 0; i < max_hit_retrieved; i++){
    best_hits_array[i].weight = 0;
  }

  /* loop over the doc, and keep the doc_id and weight in best hit table */
  for(doc = 0; doc < db->doc_table_allocated_entries; doc++){
    double weight = document_score_array[doc];
    if(worst_weight_to_make_it < weight){
      /* merge it into the best_hits array.  start at the bottom */
      for(i = (max_hit_retrieved - 1); i >= 0; i--){
	if(weight > best_hits_array[i].weight
	   /* && (check_document_id(doc, db) == true) too slow.*/
	   ){
	  /* move this entry down */
	  if((i + 1) < max_hit_retrieved){
	    best_hits_array[i+1].weight = best_hits_array[i].weight;
	    best_hits_array[i+1].document_id = best_hits_array[i].document_id;
	  }
	  best_hits_array[i].document_id = doc;
	  best_hits_array[i].weight = weight;
	}
	else
	  break;
      }
    }
  }

  for(i = 0; i < max_hit_retrieved; i++){
    /* Out of good stuff.  Leave the loop rather than the function, so
       the clean-up below still runs: when an earlier hit was dropped,
       the slots between best_hit_number and i still carry stale
       weights. */
    if(best_hits_array[i].weight <= 0)
      break;
    /* fill in the rest of the hit */
    if (read_document_table_entry(&doc_entry,
				  best_hits_array[i].document_id,
				  db)
	== true){
      best_hits_array[best_hit_number].weight = best_hits_array[i].weight;
      best_hits_array[best_hit_number].document_id = best_hits_array[i].document_id;
      best_hits_array[best_hit_number].start_character = doc_entry.start_character;
      best_hits_array[best_hit_number].end_character = doc_entry.end_character;
      best_hits_array[best_hit_number].document_length = doc_entry.document_length;
      best_hits_array[best_hit_number].number_of_lines = doc_entry.number_of_lines;
      read_filename_table_entry(doc_entry.filename_id,
				best_hits_array[best_hit_number].filename,
				best_hits_array[best_hit_number].type,
				NULL,
				db);
      /* NOTE(review): the limit is MAX_FILE_NAME_LEN although the target
	 is the headline field, and strncpy does not terminate on
	 truncation -- check against the declaration of hit. */
      strncpy(best_hits_array[best_hit_number].headline,
	      read_headline_table_entry(doc_entry.headline_id,db),
	      MAX_FILE_NAME_LEN);
      best_hit_number++;
    }
    beFriendly();
  }

  /* mark every slot behind the last completed hit as unused */
  for(i = best_hit_number; i < max_hit_retrieved; i++){
    best_hits_array[i].weight = 0;
  }
  /* print_best_hits(s); for debugging */
}
/* returns the next best hit
 *
 * best_hit: *best_character and *best_line are always set to 0.
 *
 * With BOOL defined and a boolean query current, the request is passed
 * to the query object (GetBestHit); the result is 0 when it delivered a
 * *doc_id > 0 and -1 when there are no more documents.
 *
 * Otherwise the entry best_hits_array[current_best_hit] is handed out:
 * *doc_id and *score are filled in, current_best_hit advances and 0 is
 * returned.  1 is returned when the table is used up, i.e. the index has
 * reached best_hits_array_len or the entry has weight 0.
 */
long best_hit(db, doc_id, best_character, best_line, score)
database *db;
long *doc_id;
long *best_character;
long *best_line;
long *score;
{
  *best_character = 0;
  *best_line = 0;
#ifdef BOOL
  if (currentQuery != NULL) /* for boolean */
    {
      send(currentQuery,GetBestHit,db,doc_id,best_character,best_line,score);
      if (*doc_id > 0)
	return(0); /* ok */
      else
	return(-1); /* no more docs */
    }
#endif /* BOOL */
  /* Valid indices are 0 .. best_hits_array_len-1, so the test must be
     >= (it was >).  This also covers the table not being allocated yet
     (length 0, null pointer). */
  if(current_best_hit >= best_hits_array_len)
    return(1);
  if(best_hits_array[current_best_hit].weight == 0)
    return(1);
  *doc_id = best_hits_array[current_best_hit].document_id;
  *score = best_hits_array[current_best_hit].weight;
  current_best_hit++;
  return(0);
}
/* finished_best_hit: ends the retrieval of best hits.
 * With BOOL defined and a boolean query current, Delete is sent to the
 * query object and currentQuery is forgotten.
 * Otherwise the score array and the best hits table are cleared and
 * current_best_hit goes back to 0.  Always returns 0.
 */
long finished_best_hit(db)
database *db;
{
#ifdef BOOL
  if (NULL != currentQuery){ /* for boolean */
    send(currentQuery,Delete);
    currentQuery = NULL;
    return 0;
  }
#endif /* BOOL */

  current_best_hit = 0;
  /* if we are on a small machine, we might want to
     destroy_document_score_array */
  clear_document_score_array();
  clear_best_hits_array();
  return 0;
}
/*=============================*
*=== Searching for words ===*
*=============================*/
/* see irext.h for doc
 *
 * init_search_word: nothing to prepare here; db is not looked at and
 * the function always returns 0.
 */
long init_search_word (db)
database* db;
{
  return 0;
}
/* see irext.h for doc
 *
 * search_word: looks word up in the dictionary of db, walks the chain of
 * index blocks of that word in the inverted file and adds the weight of
 * every posting to document_score_array[], indexed by document id.
 * When doc_id is non-zero (we are doing a relevant document) each weight
 * is scaled by RF_WEIGHTING first.
 *
 * Side effects: resets current_best_hit to 0 and makes
 * document_score_array at least db->doc_table_allocated_entries long.
 * char_pos, line_pos, weight and word_pair are not used here.
 * (The original notes say the word is downcased; if so, that happens
 * inside look_up_word_in_dictionary(), not in this function.)
 *
 * Returns 0 if successful or word not present, -1 on a seek or read
 * failure or when the inverted file looks corrupted.
 */
long search_word(word,char_pos, line_pos, weight, doc_id,
		 word_pair, db)
char *word; /* the word to be searched for */
long char_pos; /* the position of the start of the word */
long line_pos; /* is this needed? not for signature system */
long weight; /* how important the word looks syntactically,
		such as is it bold */
long doc_id; /* current document, seed words is 0,
		then it increments into the relevant
		document */
long word_pair;
database *db;
{
  long not_full_flag = INDEX_BLOCK_FULL_FLAG; /* start out full so it will go on looking */
  long count, index_block_size;
  long internal_document_id, internal_weight, number_of_valid_entries;
  long index_file_block_number;
  long number_of_occurances;
  long entries_wanted, bytes_wanted;
  FOUR_BYTE index_buffer_data[INDEX_ELEMENT_SIZE*(1024/4)];
  char *index_buffer;
  char *i = NULL;
  FILE *stream = NULL;

  index_buffer = (char*)index_buffer_data;
  index_file_block_number =
    look_up_word_in_dictionary(word, &number_of_occurances, db);
  current_best_hit = 0; /* so that the best hits will start from 0 */

  /* check the document_score_array */
  if(document_score_array_len < db->doc_table_allocated_entries)
    make_document_score_array(db->doc_table_allocated_entries);

  if(index_file_block_number < 0)
    return(0); /* word not present */

  /* NOTE(review): the original had a separate branch that returned -1
     when the lookup result was 0 ("an error occurred on looking up the
     word"), but it came after a >= 0 test and could never run.  The
     effective behaviour is kept: a result of 0 skips the loop below and
     yields 0.  Confirm against look_up_word_in_dictionary() whether 0
     should be reported as an error. */

  stream = db->index_stream;
  while((not_full_flag != INDEX_BLOCK_NOT_FULL_FLAG) &&
	(index_file_block_number != 0)){
    /* read the index block */
    if (0 != fseek(stream, (long)index_file_block_number,
		   SEEK_SET))
      {
	waislog(WLOG_HIGH, WLOG_ERROR,
		"fseek failed into the inverted file to position %ld",
		(long)index_file_block_number);
	return(-1);
      }

    /* a short or failed read would leave stale bytes in the buffer to
       be decoded as a block header */
    if((long)read(fileno(stream),index_buffer,INDEX_BLOCK_HEADER_SIZE)
       != (long)INDEX_BLOCK_HEADER_SIZE)
      {
	waislog(WLOG_HIGH, WLOG_ERROR,
		"reading from the index file failed");
	return(-1);
      }

    ASSIGN(not_full_flag,
	   INDEX_BLOCK_FLAG_SIZE,
	   index_buffer,
	   INDEX_BLOCK_HEADER_SIZE,
	   0 );
    ASSIGN(index_file_block_number,NEXT_INDEX_BLOCK_SIZE,
	   index_buffer+INDEX_BLOCK_FLAG_SIZE,
	   INDEX_BLOCK_HEADER_SIZE,
	   INDEX_BLOCK_FLAG_SIZE);
    ASSIGN(index_block_size,INDEX_BLOCK_SIZE_SIZE,
	   index_buffer+INDEX_BLOCK_FLAG_SIZE+NEXT_INDEX_BLOCK_SIZE,
	   INDEX_BLOCK_HEADER_SIZE,
	   INDEX_BLOCK_FLAG_SIZE+NEXT_INDEX_BLOCK_SIZE);

    if(EOF == index_block_size)
      {
	waislog(WLOG_HIGH, WLOG_ERROR,
		"reading from the index file failed");
	return(-1);
      }

    if(not_full_flag == INDEX_BLOCK_NOT_FULL_FLAG){
      /* not full: the "next block" field holds the valid byte count */
      number_of_valid_entries = index_file_block_number;
    }
    else if(not_full_flag == INDEX_BLOCK_FULL_FLAG){
      /* full */
      number_of_valid_entries = index_block_size - INDEX_BLOCK_HEADER_SIZE;
    }
    else{ /* bad news, file is corrupted. */
      waislog(WLOG_HIGH, WLOG_ERROR,
	      "Expected the flag in the inverted file to be valid.  it is %ld",
	      not_full_flag);
      return(-1);
    }

    /* add the array to the document_score_array */
    number_of_valid_entries /= INDEX_ELEMENT_SIZE; /* bytes -> entries */
    for(count=0;count < number_of_valid_entries;count++) {
      int wgt;
      int did;

      if(count%1024 == 0) {
	/* refill the buffer with the next (up to) 1024 entries */
	entries_wanted = MIN(1024,number_of_valid_entries-count);
	bytes_wanted = INDEX_ELEMENT_SIZE * entries_wanted;
	if((long)read(fileno(stream),index_buffer,bytes_wanted)
	   != bytes_wanted)
	  {
	    waislog(WLOG_HIGH, WLOG_ERROR,
		    "reading from the doc-id table failed");
	    return(-1);
	  }
	i=index_buffer;
      }
      ASSIGN(wgt,WEIGHT_SIZE,
	     i+DOCUMENT_ID_SIZE+WORD_POSITION_SIZE+CHARACTER_POSITION_SIZE,
	     INDEX_ELEMENT_SIZE,
	     DOCUMENT_ID_SIZE+WORD_POSITION_SIZE+CHARACTER_POSITION_SIZE);
      ASSIGN(did,DOCUMENT_ID_SIZE,i,INDEX_ELEMENT_SIZE,0);
      internal_weight = wgt;
      internal_document_id = did;

      if(EOF == internal_weight)
	{
	  waislog(WLOG_HIGH, WLOG_ERROR,
		  "reading from the doc-id table failed");
	  return(-1);
	}

      /* the id comes straight from the file: never let it index
	 outside the score array */
      if(internal_document_id < 0 ||
	 internal_document_id >= document_score_array_len)
	{
	  waislog(WLOG_HIGH, WLOG_ERROR,
		  "document id %ld in the inverted file is out of range",
		  internal_document_id);
	  return(-1);
	}

      /* if(doc_id > 0) we are doing a relevant document */
      document_score_array[internal_document_id] +=
	(doc_id) ? (internal_weight * RF_WEIGHTING) : internal_weight;
      i+=INDEX_ELEMENT_SIZE;
    }
  }
  return(0);
}
/* now collect the best hits
 *
 * finished_search_word: called once all words have been searched.
 * With BOOL defined and a boolean query current nothing is done.
 * Otherwise the score array and the best hits table are brought to the
 * required length and sort_best_hits() fills the table.
 * Always returns 0.
 */
long finished_search_word(db)
database *db;
{
#ifdef BOOL
  /* do nothing for boolean; was a bare "return;", which hands an
     undefined value back from a function declared to return long */
  if (currentQuery != NULL)
    return(0);
#endif /* def BOOL */

  /* check the document_score_array */
  if(document_score_array_len < db->doc_table_allocated_entries)
    make_document_score_array(db->doc_table_allocated_entries);
  make_best_hits_array(max_hit_retrieved);
  sort_best_hits(db);
  return(0);
}